Part 2 of the Species Distribution Model (SDM) Benchmark Study presents data for eight bird species across Colorado, Oregon, North Carolina, and Vermont, using the environmental rasters prepared in Part 1. The analysis examines the distribution of observations by state and species and evaluates the explanatory rasters using methods such as Principal Component Analysis (PCA) and Factor Analysis for Mixed Data (FAMD). Additionally, the study addresses pseudo-absence generation and conducts feature engineering and selection to prepare for subsequent modeling.
The initial phase of the exploratory data analysis visualizes bird species observations across four U.S. states: Oregon, Vermont, Colorado, and North Carolina, covering the period from 2016 to 2019. These visualizations display the distribution of eight distinct bird species within each state, with major cities marked for geographical context. Such views offer insight into spatial distribution patterns, potentially revealing habitat preferences, migration routes, or the influence of urban areas on these distributions. As the analysis progresses, understanding these distributions becomes pivotal, especially when addressing spatial autocorrelation and other features that influence the accuracy and validity of the Species Distribution Model (SDM) benchmarks.
data(us.cities)
# Get major cities for each sample region (state)
.states <- c("OR", "VT", "CO", "NC")
top.cities <- purrr::map_df(.states, function(s) {
out <- us.cities %>%
filter(country.etc==s) %>%
mutate(city = gsub(paste0(" ", s), "", name)) %>%
arrange(-pop)
if (s == "OR") {
out <- out %>%
head() %>%
filter(!(city %in% c("Gresham", "Hillsboro", "Corvallis",
"Beaverton", "Springfield")))
} else if (s == "CO") {
out <- out %>%
head() %>%
filter(!(city %in% c("Thornton", "Lakewood", "Aurora")))
} else if (s == "NC") {
out <- out %>%
head() %>%
filter(!(city %in% c("Greensboro", "Durham", "Fayetteville")))
} else {
out <- out %>% head()
}
out
})
# Load the map data
states <- map_data("state") %>%
filter(region %in% c("oregon", "north carolina", "colorado", "vermont"))
# Load your data
data.files <- list.files("data/final", full.names = T)
df <- purrr::map_df(data.files, readRDS)
caps.after.ws <- function(string) {
gsub("(?<=\\s)([a-z])", "\\U\\1", string, perl = T)
}
# Define a function to create a plot for each species
plot.for.species <- function(spec, st.abbr) {
st <- case_when(st.abbr == "CO" ~ "colorado",
st.abbr == "NC" ~ "north carolina",
st.abbr == "VT" ~ "vermont",
st.abbr == "OR" ~ "oregon",
T ~ "")
title <- caps.after.ws(paste(st.abbr, gsub("_", " ", spec),
"Observations, 2016-2019"))
p <- ggplot(data = states %>% filter(region == st)) +
geom_polygon(aes(x = long, y = lat, group = group),
fill = "#989875", color = "black") +
geom_point(data = df %>% filter(state == st.abbr & common.name == spec),
aes(x = lon, y = lat),
size=1, alpha=.5, fill = "red", shape=21) +
geom_point(data = top.cities %>% filter(country.etc == st.abbr),
aes(x=long, y=lat),
fill="gold", color="black", size=3.5, shape = 21) +
geom_text(data = top.cities %>% filter(country.etc == st.abbr),
aes(x=long, y=lat, label=city),
color="white", hjust=case_when(st.abbr=="NC"~.2,
st.abbr=="VT"~.65,
T~.5),
vjust=ifelse(st.abbr=="NC", -.65, 1.5),
size=4) +
coord_map() +
ggtitle(title) +
theme_minimal() +
theme(panel.background = element_blank(),
axis.text = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())
data.table(
state=st.abbr,
species=spec,
plot=list(p)
)
}
spec.state <- expand.grid(unique(df$common.name), unique(df$state)) %>%
rename(spec=Var1, st.abbr=Var2)
# Create a list of plots
plots <- purrr::map2_df(spec.state$spec,
spec.state$st.abbr,
~plot.for.species(.x, .y))
# Plot Ruddy Duck plots
do.call(ggpubr::ggarrange,
c(plots[species == "Ruddy Duck"]$plot,
list(nrow=2, ncol=2)))
# Plot Belted Kingfisher plots
do.call(ggpubr::ggarrange,
c(plots[species == "Belted Kingfisher"]$plot,
list(nrow=2, ncol=2)))
# Plot Wild Turkey plots
do.call(ggpubr::ggarrange,
c(plots[species == "Wild Turkey"]$plot,
list(nrow=2, ncol=2)))
# Plot Sharp-Shinned Hawk plots
do.call(ggpubr::ggarrange,
c(plots[species == "Sharp-shinned Hawk"]$plot,
list(nrow=2, ncol=2)))
# Plot Downy Woodpecker Plots
do.call(ggpubr::ggarrange,
c(plots[species == "Downy Woodpecker"]$plot,
list(nrow=2, ncol=2)))
# Plot Cedar Waxwing Plots
do.call(ggpubr::ggarrange,
c(plots[species == "Cedar Waxwing"]$plot,
list(nrow=2, ncol=2)))
# Plot Sandhill Crane Plots
do.call(ggpubr::ggarrange,
c(plots[species == "Sandhill Crane"]$plot,
list(nrow=2, ncol=2)))
# Plot Sanderling Plots
do.call(ggpubr::ggarrange,
c(plots[species == "Sanderling"]$plot,
list(nrow=2, ncol=2)))
The environmental rasters from Colorado, North Carolina, Oregon, and Vermont used in this study are intended to provide insight into the factors that might influence bird species distributions. The rasters were prepared in Part 1 of this study and saved as four state-wide multi-layer rasters, each at 1,000 x 1,000 meter resolution.
# Load prepared explanatory rasters by state
states <- c("CO", "NC", "OR", "VT")
r.files <- paste0("data/final_rasters/", states, ".tif")
r.list <- purrr::map(r.files, rast)
names(r.list) <- states
The table below shows the frequency distribution of various land cover types across the selected states. By evaluating this, the most common and rare habitats in each region can be ascertained. Moreover, understanding the prevalence of each land cover type can provide insights into potential hot-spots or habitats of interest for the bird species in question.
purrr::map_df(states, function(st) {
terra::freq(r.list[[st]]$NLCD_Land) %>%
mutate(state = st) %>%
dplyr::select(state, value, count)
})
Continuous rasters capture a gradient of values, often related to environmental gradients such as temperature, precipitation, or elevation. In this study, the distribution of such continuous variables for each state is illustrated using density plots. These plots provide a visual representation of the distribution patterns of each environmental variable, allowing for an assessment of the variations and patterns within each state. For instance, understanding the elevation range of Oregon compared to Colorado can offer insights into the altitudinal preferences of species. Additionally, spotting any anomalies or peaks in these distributions might suggest specific ecological or environmental significance, further aiding in the understanding and interpretation of model outputs.
# Plot densities of numeric rasters, by state
purrr::map(states, function(st) {
r <- r.list[[st]]
plots <- purrr::map(names(r)[names(r) != "NLCD_Land"], function(l) {
ggplot(data=r[l] %>% as.data.frame()) +
geom_density(aes(x=!!sym(l))) +
theme_bw() +
ggtitle(paste0(st, " Distribution for ", gsub(paste0("_", st), "", l)))
})
ggpubr::ggarrange(plotlist=plots, ncol=3, nrow=4)
}) %>%
ggpubr::ggarrange(plotlist=., ncol=1, nrow=4) +
ggtitle(paste0("Density Plots of Numeric Variables"))
In this section, the study employs Principal Component Analysis (PCA)
to transform the high-dimensional data into a lower-dimensional form
while retaining as much of the original variance as possible. The
initial step involves merging data from different states into a single
dataframe, followed by cleaning factor levels of the
NLCD_Land column. Subsequently, the code converts factor
columns into dummy variables and removes columns with only a single
unique value. The PCA() function is used to perform the
PCA, and the results are visualized using bar charts, displaying the
importance of each variable for the first two dimensions. This
visualization is important in understanding the significance of
different environmental rasters and how they contribute to the
variations captured by the principal components.
r.df <- map_df(states, function(s) {
df <- r.list[[s]] %>% as.data.frame()
names(df) <- names(df) %>% gsub(paste0("_", s), "", .)
df %>% setDT()
df[, state := factor(s, levels=states)]
df[apply(df, 1, function(.x) !any(is.na(.x)))]
})
# Custom function to process factor levels
clean.levels <- function(x) {
# Remove the layer-name prefix, if present (before lowercasing)
x <- gsub("NLCD_Land_", "", x)
# Remove non-alphanumeric characters and replace with underscores
x <- gsub("[^a-zA-Z0-9]", "_", x)
# Convert to lowercase
x <- tolower(x)
# Remove any leading or trailing underscores
x <- gsub("^_|_$", "", x)
# Collapse doubled underscores
x <- gsub("__", "_", x)
return(x)
}
r.df[, NLCD_Land := factor(clean.levels(levels(NLCD_Land))[NLCD_Land])]
# Convert factor columns to dummy variables
df.dummies <- data.table(model.matrix(~ . - 1,
data = r.df[, .(NLCD_Land, state)])) %>%
cbind(r.df[, -which(names(r.df) %in% c("NLCD_Land", "state")), with=F])
names(df.dummies) <- gsub("NLCD_Land", "", names(df.dummies))
# Ensure that there is more than one value per column (remove otherwise)
uniq.1 <- t( df.dummies[, lapply(.SD, uniqueN)]) %>%
as.data.frame() %>%
filter(V1 == 1) %>%
row.names()
if (length(uniq.1) >= 1) {
df.dummies <- df.dummies[, -which(names(df.dummies) %in% uniq.1), with=F]
}
pca.fit <- PCA(df.dummies, graph=F)
plot.PCA(pca.fit, choix="var")
res <- pca.fit$var$coord %>%
as.data.frame() %>%
mutate(var=as.factor(rownames(.))) %>%
dplyr::select(var, everything()) %>%
as_tibble()
rownames(res) <- NULL
p.d1 <- ggplot(res, aes(x = reorder(var, Dim.1), y = Dim.1)) +
geom_bar(stat = "identity", fill="darkblue") +
coord_flip() + # Makes it a horizontal bar chart
labs(title = "Variable importance for Dim.1", y = "Importance", x = "Variable") +
theme_minimal()
p.d2 <- ggplot(res, aes(x = reorder(var, Dim.2), y = Dim.2)) +
geom_bar(stat = "identity", fill="darkred") +
coord_flip() + # Makes it a horizontal bar chart
labs(title = "Variable importance for Dim.2", y = "Importance", x = "Variable") +
theme_minimal()
ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)
Diving deeper into the structure of the data, Factor Analysis for
Mixed Data (FAMD) is implemented next. FAMD is a unique technique that
extends traditional Factor Analysis by handling both continuous and
categorical data, which fits the nature of the dataset used in this
study. The code initiates this process by applying the
FAMD() function to the dataframe. The results are then
visualized in three distinct plots that highlight the variance explained
by different variables for both continuous and categorical data. These
plots provide insights into the underlying structure of the data and the
significance of each variable. By observing the variable importance for
the first two dimensions, displayed through bar charts, one can discern
which variables are related. Multi-collinearity and autocorrelation will
be addressed in greater depth later.
famd.fit <- FAMD(r.df, graph=F)
ggpubr::ggarrange(plotlist=purrr::map(
c("var", "quanti", "quali"),
~plot.FAMD(famd.fit, choix=.x)),
ncol=1, nrow=3)
res <- famd.fit$var$coord %>%
as.data.frame() %>%
mutate(var=as.factor(rownames(.))) %>%
dplyr::select(var, everything()) %>%
as_tibble()
rownames(res) <- NULL
p.d1 <- ggplot(res, aes(x = reorder(var, Dim.1), y = Dim.1)) +
geom_bar(stat = "identity", fill="darkblue") +
coord_flip() + # Makes it a horizontal bar chart
labs(title = "Variable importance for Dim.1", y = "Importance", x = "Variable") +
theme_minimal()
p.d2 <- ggplot(res, aes(x = reorder(var, Dim.2), y = Dim.2)) +
geom_bar(stat = "identity", fill="darkred") +
coord_flip() + # Makes it a horizontal bar chart
labs(title = "Variable importance for Dim.2", y = "Importance", x = "Variable") +
theme_minimal()
ggpubr::ggarrange(plotlist=list(p.d1, p.d2), nrow=2, ncol=1)
In species distribution modeling, the presence of a species in certain locations is often well recorded, but absence is typically under-reported or not reported at all. This creates a challenge when trying to understand the complete distribution of a species. To address this, pseudo-absence data are introduced: artificially generated points representing locations where the species is assumed not to be present. In this section, pseudo-absence points are generated for the eight bird species across the four states, ensuring that these points do not overlap with observed presence data. The generated pseudo-absence data provide a more comprehensive view of each species' distribution and serve as an essential component of the upcoming modeling process.
To generate pseudo-absence data in this analysis, a buffer with a 5,000 meter radius is first created around each observation point. These buffers essentially "block out" regions of the sample area so that pseudo-absence points can only be drawn from the non-buffered zones.
# Set up output directory
output.dir <- "artifacts/masks_5k"
if (!dir.exists("artifacts")) dir.create("artifacts")
if (!dir.exists(output.dir)) dir.create(output.dir)
# Load Data
# Get raster data
states <- c("CO", "NC", "OR", "VT")
r.list <- purrr::map(paste0("data/final_rasters/", states, ".tif"), rast)
names(r.list) <- states
# Get observation data
obs.df <- list.files("data/final", full.names = T) %>%
purrr::map_df(readRDS) %>%
dplyr::select(state, common.name, observation.point=geometry)
# Buffering Raster Data
dist <- 5e3
mask.update <- function(i, mask.raster, obs.df, obs.field="observation.point",
dist=10000, u="m") {
obs.pt <- st_transform(obs.df[i, obs.field], st_crs(mask.raster))
# Create a buffer around the point, ensuring correct units
buf <- st_buffer(obs.pt, dist=units::as_units(paste(dist, u)))
return(terra::rasterize(buf, mask.raster, update=T, field=1))
}
# For each observation point, you can now create a distance
# raster and then mask cells within the buffer distance
get.buffered.zones <- function(r, obs.df, obs.field="observation.point",
dist=10000, u="m") {
# Create an empty raster with the same extent and resolution as r
mask.raster <- terra::rast(r)
# Iteratively update mask.raster with additional buffered regions
for(i in 1:nrow(obs.df)) {
mask.raster <- mask.update(i, mask.raster, obs.df,
obs.field=obs.field,
dist=dist, u=u)
gc()
}
return(mask.raster)
}
# Get masks by state, species
masks <- purrr::map(states, function(state) {
specs <- sort(unique(obs.df$common.name))
spec.masks <- purrr::map(specs, function(spec, st=state) {
fname <- file.path(output.dir, paste0(st, "_", spec, ".tif"))
if (file.exists(fname)) {
cat("Reading", spec, st, "mask from", fname, "\n")
r.mask <- rast(fname)
} else {
cat("Computing", spec, st, "mask, and saving to", fname, "\n")
r.mask <- get.buffered.zones(r=r.list[[st]],
obs.df=filter(obs.df, state == st &
common.name == spec),
dist=dist)
terra::writeRaster(r.mask, fname, overwrite=T)
}
gc()
r.mask
}, .progress=T)
names(spec.masks) <- specs
spec.masks
})
names(masks) <- states
This section identifies areas outside the buffers as potential zones for pseudo-absences. To ensure an accurate representation, a random sampling mechanism is implemented: for each bird species in a state, pseudo-absence points are sampled to match the number of observed points (or a pre-defined minimum threshold). The result is a set of coordinates representing regions where the bird species has not been observed.
# Set seed
set.seed(19)
# Function to sample n points from the non-masked parts
sample.inverse.mask <- function(r.original, r.mask, n,
species, state,
sample.crs=4326, min.n=500,
output.dir="artifacts/pseudo_absence_regions") {
if (!dir.exists(output.dir)) dir.create(output.dir)
output.path <- file.path(output.dir,
gsub(" |\\-", "_",
paste0(
paste(state, tolower(species), sep="_"),
".tif")
))
if (!file.exists(output.path)) {
# Get inverse mask;
# Set NA cells to 0, keep 0 cells as 0, change other cells to 1
r.inverse <- terra::ifel(is.na(r.mask), 0, r.mask)
# Set 0 to 1 and everything else to NA
r.inverse <- terra::lapp(r.inverse, fun = function(x) ifelse(x == 0, 1, NA))
# Crop so that anything outside of the state is NA
r.cropped <- terra::crop(r.inverse, terra::ext(r.original))
# Create a binary raster from r.original where valid values are
# set to 1 and NA values remain NA
r.binary <- terra::lapp(r.original[[1]],
fun = function(x) ifelse(!is.na(x), 1, NA))
# Multiply the cropped raster by the binary raster to ensure
# outside values are set to NA
r.final <- r.cropped * r.binary
terra::writeRaster(r.final, output.path, overwrite=T)
} else {
r.final <- terra::rast(output.path)
}
# Convert the raster to SpatialPoints
gdf <- terra::as.points(r.final) %>%
st_as_sf() %>%
st_transform(crs=sample.crs)
if (nrow(gdf) > 0) {
gdf <- gdf %>%
filter(!is.na(layer)) %>%
select(geometry)
} else {
return(gdf)
}
# Set to min.n size if n < min.n
if (n < min.n) n <- min.n
# Make sure there is sufficient available sample points
if (n > nrow(gdf)) n <- nrow(gdf)
# Randomly sample n points from the available (non-masked) space
sample.idx <- sample(1:nrow(gdf), n)
samples <- gdf[sample.idx,] %>%
mutate(common.name = species,
state = state,
lon = NA,
lat = NA,
observations=0)
# Populate lon and lat values:
coords <- st_coordinates(samples)
samples$lon <- coords[, "X"]
samples$lat <- coords[, "Y"]
return(samples)
}
# Get totals by species and state
totals <- obs.df %>%
as_tibble() %>%
select(state, common.name) %>%
group_by(state, common.name) %>%
summarize(N=n(), .groups="keep")
if (!dir.exists(file.path("data", "final_pseudo_absence"))) {
dir.create(file.path("data", "final_pseudo_absence"))
}
if (!all(
file.exists(
paste0(file.path("data", "final_pseudo_absence",
paste0("pa_", states, ".rds")))
)
)) {
# Create a list of pseudo absence points, by species and state,
# where the sample number `n` >= 500 | `n` == the total observed
# for each respective state and species
pseudo.absence.pts <- list()
for (st in states) {
r.original <- r.list[[st]]
r.masks <- masks[[st]]
pseudo.absence.pts[[st]] <- list()
for (spec in names(r.masks)) {
r.mask <- r.masks[[spec]]
n <- totals %>% filter(state == st & common.name == spec) %>% pull(N)
cat("Generating", n, "pseudo-absence points for the", spec, "in", st, "\n")
pseudo.absence.pts[[st]][[spec]] <- sample.inverse.mask(
r.original, r.mask, spec, st, n=n, sample.crs=4326)
cat("\tGenerated", nrow(pseudo.absence.pts[[st]][[spec]]), "/", n, "points.\n")
}
}
# Extract raster values for each point
for (state in states) {
out.file.all <- file.path("data", "final_pseudo_absence", paste0("pa_", state, ".rds"))
if (!file.exists(out.file.all)) {
r <- r.list[[state]]
cat(sprintf("Extracting points to values for %s...\n", state))
# Load observations shapefile
geo.df <- pseudo.absence.pts[[state]] %>% do.call("rbind", .)
rownames(geo.df) <- NULL
geo.df.crs <- st_crs(geo.df)
# Define target CRS and update
target.crs <- st_crs(r)
cat(sprintf("Updating CRS for %s...\n", state))
geo.df <- st_transform(geo.df, target.crs)
# Extract raster values
for (r.name in names(r)) {
cat("\tExtracting", r.name, "values for", state, "\n")
x <- terra::extract(r[[r.name]], geo.df)[[r.name]]
geo.df[[gsub(paste0("_", state), "", r.name)]] <- x
}
# Update crs back
geo.df <- st_transform(geo.df, geo.df.crs)
# Fix names; Filter NA values
geo.df <- geo.df %>%
filter(dplyr::if_all(names(.), ~!is.na(.))) %>%
suppressWarnings()
saveRDS(geo.df, out.file.all)
cat("--------------\n")
}
}
}
The table below compares the generated pseudo-absence data with the observed dataset. This comparison ensures that the number of pseudo-absence points matches the observed ones, balancing the dataset and making it ready for the next analysis steps.
# Get all pseudo-absence data
abs.df <- list.files(file.path("data", "final_pseudo_absence"), full.names = T) %>%
purrr::map_df(readRDS) %>%
select(state, common.name, observation.point=geometry)
# There might be some slight differences since there are occasionally NA values
abs.df %>%
as_tibble() %>%
select(state, common.name) %>%
group_by(state, common.name) %>%
summarize(pseudo.absence.N=n(), .groups="keep") %>%
left_join(totals, by=c("state", "common.name")) %>%
rename(observed.N = N) %>%
knitr::kable()
| state | common.name | pseudo.absence.N | observed.N |
|---|---|---|---|
| CO | Belted Kingfisher | 4523 | 4551 |
| CO | Cedar Waxwing | 3431 | 3446 |
| CO | Downy Woodpecker | 7440 | 7511 |
| CO | Ruddy Duck | 1715 | 1726 |
| CO | Sanderling | 497 | 131 |
| CO | Sandhill Crane | 1524 | 1532 |
| CO | Sharp-shinned Hawk | 2241 | 2257 |
| CO | Wild Turkey | 2597 | 2611 |
| NC | Belted Kingfisher | 4042 | 4183 |
| NC | Cedar Waxwing | 4059 | 4195 |
| NC | Downy Woodpecker | 10415 | 10914 |
| NC | Ruddy Duck | 1076 | 1106 |
| NC | Sanderling | 488 | 311 |
| NC | Sandhill Crane | 489 | 118 |
| NC | Sharp-shinned Hawk | 1211 | 1254 |
| NC | Wild Turkey | 2261 | 2372 |
| OR | Belted Kingfisher | 5803 | 5837 |
| OR | Cedar Waxwing | 8405 | 8446 |
| OR | Downy Woodpecker | 8529 | 8576 |
| OR | Ruddy Duck | 1996 | 2010 |
| OR | Sanderling | 496 | 258 |
| OR | Sandhill Crane | 2443 | 2458 |
| OR | Sharp-shinned Hawk | 2696 | 2714 |
| OR | Wild Turkey | 2429 | 2440 |
| VT | Belted Kingfisher | 1956 | 2033 |
| VT | Cedar Waxwing | 1098 | 3938 |
| VT | Downy Woodpecker | 1598 | 4635 |
| VT | Ruddy Duck | 490 | 51 |
| VT | Sanderling | 493 | 39 |
| VT | Sandhill Crane | 492 | 76 |
| VT | Sharp-shinned Hawk | 730 | 748 |
| VT | Wild Turkey | 2090 | 2181 |
Moran's I is a common measure of global spatial autocorrelation. A positive Moran's I suggests clustering, a negative value suggests dispersion, and a value near zero suggests randomness. The test applied here is a randomization approach: it checks whether the data exhibit a spatial pattern significantly different from what would be expected if the values were randomly distributed in space. Its output includes the following quantities:
- **Moran I statistic:** The calculated Moran's I value for the data, which quantifies the degree of spatial autocorrelation. A positive value indicates positive autocorrelation (similar values are closer together), while a negative value indicates negative autocorrelation (dissimilar values are closer together). A value close to zero indicates a random spatial pattern.
- **Moran I statistic standard deviate:** The standardized Moran's I value. The larger the absolute value of this statistic, the stronger the evidence against the null hypothesis of no spatial autocorrelation. A positive value indicates positive spatial autocorrelation, suggesting clustering of similar values.
- **P-value:** The probability of observing a Moran's I value as extreme as, or more extreme than, the one computed from the data, assuming the null hypothesis of no spatial autocorrelation is true. A very small p-value provides strong evidence to reject the null hypothesis, indicating significant spatial autocorrelation.
- **Expectation:** The expected Moran's I value under the null hypothesis of no spatial autocorrelation. For a large dataset, it is typically close to zero.
- **Variance:** The variance of the Moran's I statistic under the null hypothesis.
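For reference, all of these quantities derive from the definition of Moran's I. For values $x_1, \dots, x_n$ with spatial weights $w_{ij}$ (here, row-standardized k-nearest-neighbor weights) and $W = \sum_i \sum_j w_{ij}$:

$$I = \frac{n}{W}\,\frac{\sum_{i=1}^{n}\sum_{j=1}^{n} w_{ij}\,(x_i - \bar{x})(x_j - \bar{x})}{\sum_{i=1}^{n} (x_i - \bar{x})^2}, \qquad E[I] = \frac{-1}{n - 1}$$

under the null hypothesis of no spatial autocorrelation.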
if (!dir.exists("artifacts/obs_autocor_morans")) {
dir.create("artifacts/obs_autocor_morans")
}
# Get observation data
df <- list.files(file.path("data", "final"), full.names = T) %>%
purrr::map_df(readRDS) %>%
select(state, common.name, observations, geometry) # Only these fields are needed here
states <- sort(unique(df$state))
species <- sort(unique(df$common.name))
# Function to extract the desired results from output
extract.data <- function(st, spec, results) {
tibble(
state = st,
species = spec,
statistic = results$statistic,
p.value = results$p.value,
moran.i.statistic = results$estimate['Moran I statistic'],
expectation = results$estimate['Expectation'],
variance = results$estimate['Variance']
)
}
# Define the k for knn computation
k <- 5
if (!file.exists("artifacts/obs_autocor_morans/mi.results.rds")) {
# Perform test by state, by species
mi.results <- list()
for (st in states) {
mi.results[[st]] <- list()
for (spec in species) {
cat("Doing Moran's test for", spec, "in", st, "\n")
# Filter data
d <- df %>% filter(common.name == spec & state == st)
mi.results[[st]][[spec]]$data <- d
# Fit knn model
knn <- d %>%
knearneigh(k = k)
mi.results[[st]][[spec]]$knn <- knn
# Create a neighbor's list
nb <- knn %>%
knn2nb()
mi.results[[st]][[spec]]$nb <- nb
# Create a spatial weights matrix
listw <- nb2listw(nb)
mi.results[[st]][[spec]]$listw <- listw
# Compute Moran's I
results <- moran.test(d$observations, listw)
mi.results[[st]][[spec]]$moran.test.results <- extract.data(st, spec, results)
}
}
saveRDS(mi.results, "artifacts/obs_autocor_morans/mi.results.rds")
} else {
mi.results <- readRDS("artifacts/obs_autocor_morans/mi.results.rds")
}
spec.stat <- expand.grid(species=species, state=states, stringsAsFactors=F)
mi.results.df <- purrr::map(1:nrow(spec.stat), function(i) {
spec <- spec.stat[i, ]$species
st <- spec.stat[i, ]$state
mi.results[[st]][[spec]]$moran.test.results
}) %>%
do.call("rbind", .) %>%
as_tibble() %>%
# Compute adjusted p-value, accounting for multiple comparisons
mutate(adj.p.value = p.adjust(p.value, method="holm"))
# Filter to where the adjusted p-value <= 0.05; sort by moran i stat
mi.results.df %>%
filter(adj.p.value <= 0.05) %>%
arrange(-moran.i.statistic)
The Moran’s I statistics for the Belted Kingfisher and Cedar Waxwing in Oregon are positive, as are those for the Sharp-shinned Hawk in both Vermont and Oregon. This suggests that locations with high observation counts for these species are close to other locations with high counts, and likewise for low counts. In simple terms, the observation patterns of these species are clustered.
Below is an example of the actual observations of the Sharp-shinned Hawk in Vermont plotted against the spatial lag (i.e., the weighted sum of the neighboring values at each point). A positive slope in this Moran scatter plot reflects positive spatial autocorrelation: high-count locations tend to neighbor other high-count locations.
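Formally, the spatial lag of a variable $x$ at location $i$ is

$$(Wx)_i = \sum_{j} w_{ij}\,x_j,$$

which, with the row-standardized k-nearest-neighbor weights used here ($k = 5$), is simply the average of the observation counts at each point's five nearest neighbors.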
# Get spatial weights list
vt.ssh <- mi.results[["VT"]][["Sharp-shinned Hawk"]]$data
# Calculate the spatial lag of the observations
vt.ssh$spatial.lag <- lag.listw(
mi.results[["VT"]][["Sharp-shinned Hawk"]]$listw,
vt.ssh$observations)
# Scatter plot for the Sharp-Shinned Hawk in Vermont
ggplot(vt.ssh, aes(x=observations, y=spatial.lag)) +
geom_point() +
geom_smooth(method="lm", col="red") + # Adds a linear regression line
ggtitle("Moran Scatter Plot for Sharp-shinned Hawk in VT") +
xlab("Observations (log10 Scale)") +
ylab("Spatial Lag of Observations (log10 Scale)") +
scale_y_log10() + scale_x_log10()
env.df.list <- list()
for (state in states) {
r <- r.list[[state]]
gdf <- terra::as.points(r) %>%
st_as_sf() %>%
st_transform(crs=4326)
# Fix names
names(gdf) <- gsub(paste0("_", state, "$"), "", names(gdf))
# Convert land cover to binary variables
binary.lc <- caret::dummyVars(~NLCD_Land, data=gdf, sep=".") %>%
predict(., gdf) %>%
as.data.frame()
names(binary.lc) <- gsub("NLCD_Land", "", names(binary.lc)) %>%
gsub("\\/| |,", "_", .) %>%
gsub("__", "_", .) %>%
tolower()
gdf <- gdf %>% select(-NLCD_Land) %>% cbind(binary.lc)
env.df.list[[state]] <- gdf
}
# Perform test by state, by environmental variable
env.mi.results <- list()
output.dir <- file.path("artifacts", "env_autocor_morans")
if (!dir.exists(output.dir)) dir.create(output.dir)
for (st in states) {
env.mi.results[[st]] <- list()
gdf <- env.df.list[[st]]
env.vars <- names(gdf)[names(gdf) != "geometry"]
for (e in env.vars) {
output.path <- file.path(output.dir, paste0(st, "_", e, ".rds"))
if (!file.exists(output.path)) {
cat("Doing Moran's test for", e, "in", st, "\n")
# Filter data
d <- gdf %>% select(!!sym(e))
# env.mi.results[[st]][[e]]$data <- d
n.distinct <- d %>% pull(!!sym(e)) %>% unique() %>% length()
if (n.distinct > 1) {
# Fit knn model
knn <- d %>%
knearneigh(k = k)
# env.mi.results[[st]][[e]]$knn <- knn
# Create a neighbor's list
nb <- knn %>%
knn2nb()
# env.mi.results[[st]][[e]]$nb <- nb
# Create a spatial weights matrix
listw <- nb2listw(nb)
# env.mi.results[[st]][[e]]$listw <- listw
# Compute Moran's I
results <- moran.test(d[[e]], listw)
env.mi.results[[st]][[e]]$moran.test.results <- extract.data(st, e, results)
}
cat("\tSaving results...\n")
saveRDS(env.mi.results[[st]][[e]]$moran.test.results, output.path)
} else {
cat("Getting Moran's test results for", e, "in", st, "\n")
env.mi.results[[st]][[e]]$moran.test.results <- readRDS(output.path)
}
}
}
env.stat <- expand.grid(env=names(env.df.list$CO)[names(env.df.list$CO) != "geometry"],
state=states, stringsAsFactors=F)
env.mi.results.df <- purrr::map(1:nrow(env.stat), function(i) {
e <- env.stat[i, ]$env
st <- env.stat[i, ]$state
env.mi.results[[st]][[e]]$moran.test.results
}) %>%
do.call("rbind", .) %>%
as_tibble() %>%
# Compute adjusted p-value, accounting for multiple comparisons
mutate(adj.p.value = p.adjust(p.value, method="holm"))
# Filter to where the adjusted p-value <= 0.05; sort by moran i stat
env.mi.results.df %>%
filter(adj.p.value <= 0.05) %>%
arrange(-moran.i.statistic)
It is apparent that many of the environmental variables exhibit strong spatial clustering patterns. Notably, the weather layers have Moran’s I values nearing 1, signifying near-perfect spatial autocorrelation. Such patterns, while anticipated for certain types of variables, can introduce challenges in statistical analyses. Specifically, strong spatial autocorrelation often violates the assumption of observation independence fundamental to many traditional statistical models, potentially leading to biased parameter estimates and misleading significance levels. However, this spatial dependence can be accounted for using specialized spatial statistical methods or integrated as inherent structure in certain modeling techniques. It’s crucial to select an appropriate modeling method that accommodates or leverages this spatial structure, ensuring reliable and valid results.
Multi-collinearity is addressed in the following section, while the other mitigation strategies will be implemented either during the modeling stage of the study (e.g., cross-validation) or during the feature engineering/selection portion.
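The first diagnostic below is the condition index. Given the eigenvalues $\lambda_1 \geq \dots \geq \lambda_p$ of the predictor correlation matrix, the condition index of the $k$-th dimension is

$$CI_k = \sqrt{\frac{\lambda_{\max}}{\lambda_k}},$$

and, by a common rule of thumb, values above roughly 30 (the threshold used in the code below) indicate problematic multicollinearity among the variables loading on that dimension.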
all.columns <- unique(unlist(lapply(env.df.list, names)))
env.df <- lapply(env.df.list, function(df) {
missing.cols <- setdiff(all.columns, names(df))
for (col in missing.cols) {
df[[col]] <- NA
}
return(df)
}) %>%
do.call("rbind", .) %>%
as.data.frame() %>%
select(-geometry, -unclassified)
corr.matrix <- cor(env.df, use="na.or.complete")
eigenvalues <- eigen(corr.matrix)$values
ci.df <- tibble(
variable=names(env.df),
condition.index=sqrt(max(eigenvalues) / eigenvalues)
)
ci.df
# Extract the eigenvectors
eigenvectors <- eigen(corr.matrix)$vectors
threshold <- 30
large.ci.indices <- which(sqrt(max(eigenvalues) / eigenvalues) > threshold)
# Examine the eigenvectors
for (index in large.ci.indices) {
cat(paste("Eigenvalue:", eigenvalues[index]), "\n")
cat(paste("Condition Index:",
sqrt(max(eigenvalues) / eigenvalues[index])), "\n")
# Sorting eigenvector components by absolute magnitude for clarity
ev <- eigenvectors[, index]
sorted.ev <- sort(abs(ev), decreasing = T)
ev.top <- sorted.ev[1:5] %>%
as_tibble() %>%
mutate(variable=rownames(corr.matrix)[order(-abs(ev))][1:5]) %>%
select(variable, value)
cat("\nTop 5 contributors to multicollinearity at the condition idx:\n")
for (row in 1:5) {
cat("\t", ev.top[row,]$variable, ":", signif(ev.top[row,]$value, 3), "\n")
}
cat("\n------------------------\n")
}
## Eigenvalue: 0.00135666182336175
## Condition Index: 65.3042479620253
##
## Top 5 contributors to multicollinearity at the condition idx:
## avg_prcp : 0.814
## tmax : 0.459
## tmin : 0.356
## dem : 0.0108
## Winter_NDVI : 0.00657
##
## ------------------------
## Eigenvalue: 1.97064586234132e-15
## Condition Index: 54184234.4382929
##
## Top 5 contributors to multicollinearity at the condition idx:
## shrub_scrub : 0.508
## evergreen_forest : 0.5
## herbaceous : 0.473
## cultivated_crops : 0.318
## deciduous_forest : 0.207
##
## ------------------------
corrplot::corrplot(corr.matrix)
As described previously, feature engineering is an essential step in predictive modeling: it can mitigate problems due to multi-collinearity or spatial autocorrelation, reduce overfitting, and provide new insights unavailable through basic pre-processing of the data.
First, create a space to output any “engineered” rasters:
output.dir <- "artifacts/feature_engineering"
if (!dir.exists(output.dir)) dir.create(output.dir)
Recall the hierarchical categories for the land cover data: each detailed NLCD class rolls up into a parent category (water, developed, forest, shrubland, herbaceous, planted/cultivated, wetlands, and so on), as encoded in the mapping below.
Using these categories, feature reduction can be accomplished with a heuristic methodology. Other redundant rasters, such as the Waterbody and Urban Imperviousness rasters, can also be combined with their related land cover categories to further reduce multicollinearity between rasters.
create.parent.category.rasters <- function(input.raster,
state,
output.dir,
layer.name="NLCD_Land") {
# Define the category mappings
categories <- list(
water = c("Open Water"),
ice.snow = c("Perennial Ice/Snow"),
developed = c("Developed, Open Space",
"Developed, Low Intensity",
"Developed, Medium Intensity",
"Developed, High Intensity"),
barren = c("Barren Land"),
forest = c("Mixed Forest",
"Deciduous Forest",
"Evergreen Forest"),
shrubland = c("Shrub/Scrub", "Dwarf Shrub"),
herbaceous = c("Herbaceous", "Grassland/Herbaceous",
"Sedge/Herbaceous", "Lichens", "Moss"),
planted.cultivated = c("Cultivated Crops", "Hay/Pasture"),
wetlands = c("Woody Wetlands", "Emergent Herbaceous Wetlands")
)
# Iterate through each category to create and save the binary raster
for (cat.name in names(categories)) {
# Define the output file path
output.file <- file.path(output.dir, paste0(state, "_", cat.name, ".tif"))
if (!file.exists(output.file)) {
cat("Generating raster for", cat.name, "in", state, "\n")
if (cat.name != "developed") {
# Create a binary raster for the current category
out.raster <- terra::lapp(input.raster[[layer.name]],
fun = function(x) {
case_when(
is.na(x) ~ NA,
x %in% categories[[cat.name]] ~ 1.,
T ~ 0)
})
names(out.raster) <- cat.name
if (cat.name == "water") {
# Combine with waterbody raster layer
wb.raster <- input.raster[[paste0("waterbody_", state)]]
out.raster <- (out.raster + wb.raster) / 2
names(out.raster) <- "water"
}
} else {
# Map developed classes to an intensity scale from 0.25 to 1, in steps of 0.25
out.raster <- terra::lapp(input.raster[[layer.name]],
fun = function(x) {
case_when(
is.na(x) ~ NA,
x == "Developed, Open Space" ~ 0.25,
x == "Developed, Low Intensity" ~ 0.5,
x == "Developed, Medium Intensity" ~ 0.75,
x == "Developed, High Intensity" ~ 1.,
T ~ 0)
})
# Combine with urban imperviousness
ui.raster <- input.raster[[paste0("urban_imperviousness_", state)]]
ui.min <- terra::minmax(ui.raster)[[1]]
ui.max <- terra::minmax(ui.raster)[[2]]
ui.raster.scaled <- (ui.raster - ui.min) / (ui.max - ui.min)
out.raster <- (out.raster + ui.raster.scaled) / 2
names(out.raster) <- "developed"
}
# Save the output raster to the specified output path
writeRaster(out.raster, output.file, overwrite=T)
}
}
}
for (state in states) {
create.parent.category.rasters(r.list[[state]], state, output.dir)
}
NDVI, or Normalized Difference Vegetation Index, is a common measure of vegetation health. For each state, there are four separate NDVI rasters representing different seasons. Aggregating these rasters provides two essential insights: the minimum and maximum NDVI values. The minimum NDVI may correspond to the period when vegetation is least vigorous (e.g., winter or drought), while the maximum may indicate the period of peak vegetation growth (e.g., spring or summer). By scaling these values between 0 and 1, and then storing them separately as minimum and maximum NDVI rasters, a standardized measure of vegetation dynamics throughout the year for each state is achieved.
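Each seasonal layer is rescaled to the unit interval with the usual min-max transformation before the layers are aggregated:

$$x' = \frac{x - x_{\min}}{x_{\max} - x_{\min}}$$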
# Convert 4 separate NDVI rasters to a single raster
for (state in states) {
output.file.min <- file.path(output.dir, paste0(state, "_min_NDVI.tif"))
output.file.max <- file.path(output.dir, paste0(state, "_max_NDVI.tif"))
if (!file.exists(output.file.min) | !file.exists(output.file.max)) {
r.ndvi <- r.list[[state]][[names(r.list[[state]]) %like% "NDVI"]]
# Parse seasons
for (s in names(r.ndvi)) {
r <- r.ndvi[[s]]
.min <- terra::minmax(r)[[1]]
.max <- terra::minmax(r)[[2]]
# Scale to be from 0 to 1
r.ndvi[[s]] <- (r - .min) / (.max - .min)
}
# Get min/max scaled NDVI values
r.max <- max(r.ndvi)
names(r.max) <- "max.ndvi"
r.min <- min(r.ndvi)
names(r.min) <- "min.ndvi"
writeRaster(r.max, output.file.max, overwrite=T)
writeRaster(r.min, output.file.min, overwrite=T)
}
}
In this section, the difference between the maximum and minimum temperatures for each state is calculated. This temperature difference, often referred to as diurnal temperature range, can provide insights into the variability of daily temperatures. A high difference may suggest drastic temperature changes from day to night, which can impact ecosystems and human health. By creating a scaled raster of these differences (ranging from 0 to 1), the temperature variability can be compared across different states.
# Get the difference between the min and max temperatures as a raster
for (state in states) {
output.file <- file.path(output.dir, paste0(state, "_tdiff.tif"))
if (!file.exists(output.file)) {
# Get difference
r.tdiff <- r.list[[state]][[paste0("tmax_", state)]] -
r.list[[state]][[paste0("tmin_", state)]]
# Get min/max differences
.min <- terra::minmax(r.tdiff)[[1]]
.max <- terra::minmax(r.tdiff)[[2]]
# Scale to be from 0 to 1
r.tdiff <- (r.tdiff - .min) / (.max - .min)
names(r.tdiff) <- "temp.diff"
# Write raster
writeRaster(r.tdiff, output.file, overwrite=T)
}
}
Next, features are derived directly from the spatial coordinates, which can capture and account for spatial structure in the data. For example, polynomial terms based on latitude and longitude can absorb some of the broad spatial trend.
for (state in states) {
output.file.lon <- file.path(output.dir, paste0(state, "_lon.tif"))
output.file.lat <- file.path(output.dir, paste0(state, "_lat.tif"))
output.file.lon2 <- file.path(output.dir, paste0(state, "_lon_poly.tif"))
output.file.lat2 <- file.path(output.dir, paste0(state, "_lat_poly.tif"))
output.file.lli <- file.path(output.dir, paste0(state, "_lon_lat_interaction.tif"))
if (!all(file.exists(c(output.file.lon, output.file.lat,
output.file.lon2, output.file.lat2,
output.file.lli)))) {
# Get raster as df
r.df <- terra::as.data.frame(r.list[[state]], xy=T, cells=T) %>%
rename(lon=x, lat=y) %>%
select(lon, lat, cell) %>%
st_as_sf(coords = c("lon", "lat"), crs = st_crs(r.list[[state]])) %>%
st_transform(crs=4326) %>%
cbind(st_coordinates(.)) %>%
rename(lon="X", lat="Y")
# Initialize empty raster templates
r.lat <- ext(r.list[[state]]) %>%
rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
r.lon <- ext(r.list[[state]]) %>%
rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
# Fill with lat/lon values
r.lat[r.df$cell] <- r.df$lat
names(r.lat) <- "lat"
r.lon[r.df$cell] <- r.df$lon
names(r.lon) <- "lon"
# Get polynomial and interaction terms
r.lat2 <- r.lat^2
names(r.lat2) <- "lat.sqrd"
r.lon2 <- r.lon^2
names(r.lon2) <- "lon.sqrd"
r.lon.lat <- r.lon * r.lat
names(r.lon.lat) <- "lon.lat.interaction"
# Write outputs
writeRaster(r.lat, output.file.lat, overwrite=T)
writeRaster(r.lon, output.file.lon, overwrite=T)
writeRaster(r.lat2, output.file.lat2, overwrite=T)
writeRaster(r.lon2, output.file.lon2, overwrite=T)
writeRaster(r.lon.lat, output.file.lli, overwrite=T)
}
}
Digital Elevation Models (DEM) give us a detailed overview of the terrain elevation. However, sometimes, larger spatial trends (e.g., the general increase in elevation from the coast to the mountains) can overshadow more localized features or variations. By de-trending the DEM, the underlying larger spatial trend is removed, highlighting only the local elevation differences. In the given section, a linear model is fitted to the DEM using latitude and longitude (and their polynomial terms) to capture the broader spatial trends. The residuals from this model represent the local variations in elevation, devoid of the overarching trend. These de-trended values are then scaled between 0 and 1 to create a standardized representation of the localized elevation changes for each state.
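Concretely, the model fit below is the second-order trend surface

$$\text{dem}_i = \beta_0 + \beta_1\,\text{lat}_i + \beta_2\,\text{lon}_i + \beta_3\,\text{lat}_i\,\text{lon}_i + \beta_4\,\text{lat}_i^2 + \beta_5\,\text{lon}_i^2 + \varepsilon_i,$$

and the de-trended elevation is the min-max-scaled residual $\hat{\varepsilon}_i$.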
for (state in states) {
output.file <- file.path(output.dir, paste0(state, "_detrend_dem.tif"))
if (!file.exists(output.file)) {
# Get raster as df
r.df <- terra::as.data.frame(r.list[[state]], xy=T, cells=T) %>%
rename(lon=x, lat=y) %>%
select(lon, lat, cell, !!sym(paste0("dem_", state))) %>%
st_as_sf(coords = c("lon", "lat"), crs = st_crs(r.list[[state]])) %>%
st_transform(crs=4326) %>%
cbind(st_coordinates(.)) %>%
rename(lon="X", lat="Y", dem=paste0("dem_", state)) %>%
# Convert to data.table
setDT()
# Fit model based on location, with dem as response
fit <- lm(dem ~ lat * lon + I(lat^2) + I(lon^2),
data=r.df[!is.na(dem)])
# Get residuals from the model as "de-trended" dem values
r.df[!is.na(dem), dem.detrended := residuals(fit)]
# Scale de-trended values
r.df[, dem.detrended := (dem.detrended - min(dem.detrended, na.rm=T)) /
(max(dem.detrended, na.rm=T) - min(dem.detrended, na.rm=T))]
# Initialize empty raster template
r.dem <- ext(r.list[[state]]) %>%
rast(res=res(r.list[[state]]), crs=crs(r.list[[state]]))
# Fill with de-trended dem values
r.dem[r.df$cell] <- r.df$dem.detrended
names(r.dem) <- "dem.detrended"
# Write outputs
writeRaster(r.dem, output.file, overwrite=T)
}
}
Feature selection is the process of determining the most relevant input variables to use in modeling or analysis. By selecting the most informative features, one can improve the model’s accuracy, reduce overfitting, reduce issues due to multi-collinearity and spatial autocorrelation, and decrease the computational cost of training.
In this section, the final dataset that will be used for feature selection is prepared. This involves combining the observation and pseudo-absence data, extracting the values of the engineered rasters at each point, and converting the land cover categories into binary indicator variables (dropping those made redundant by the engineered features).
final.output.dir <- "artifacts/final_data"
if (!dir.exists(final.output.dir)) dir.create(final.output.dir)
final.fpath <- file.path(final.output.dir, "final_data.rds")
if (!file.exists(final.fpath)) {
# Combine observation/pseudo-absence data
df <- c(list.files(file.path("data", "final_pseudo_absence"), full.names = T),
list.files(file.path("data", "final"), full.names = T)) %>%
purrr::map_df(readRDS)
# Get feature engineered raster data
output.dir <- "artifacts/feature_engineered_final"
if (!dir.exists(output.dir)) dir.create(output.dir)
# Re-extract if the engineered multi-layer rasters are missing for any state
get.fe <- length(list.files(output.dir)) < length(states)
fe.r.list <- list()
if (get.fe) {
for (state in states) {
fe.r.list[[state]] <- list.files(
"artifacts/feature_engineering", full.names = T)[
grepl(
paste0("^", state, "_"),
list.files("artifacts/feature_engineering", full.names = F)
)
] %>% rast()
# Save as multi-layer rasters by state
writeRaster(fe.r.list[[state]],
file.path(output.dir, paste0(state, ".tif")),
overwrite=T)
}
} else {
for (state in states) {
fe.r.list[[state]] <- rast(file.path(output.dir, paste0(state, ".tif")))
}
}
# Add empty fields in dataframe for each new raster
purrr::map(fe.r.list, names) %>%
reduce(c) %>%
unique() %>%
sort() %>%
purrr::walk(function(n) {
if (!(n %in% names(df))) df[[n]] <<- 0
})
# Update point crs to match fe rasters
df <- st_transform(df, st_crs(fe.r.list[[1]]))
df.list <- list()
# From state multi-layer rasters, extract values from each point in df
for (st in states) {
cat(sprintf("Extracting points to values for %s...\n", st))
r <- fe.r.list[[st]]
df.list[[st]] <- df %>% filter(state == st)
# Extract raster values
for (r.name in names(r)) {
cat("\tExtracting", r.name, "values for", st, "\n")
x <- terra::extract(r[[r.name]], df.list[[st]])[[r.name]]
df.list[[st]] <- mutate(df.list[[st]], !!r.name := x)
}
}
# Combine list
df <- do.call("rbind", df.list)
rm(df.list)
gc()
# Update crs back
df <- st_transform(df, 4326)
# Remove rownames
rownames(df) <- NULL
# Convert land cover to binary variables
binary.lc <- caret::dummyVars(~NLCD_Land, data=df, sep="") %>%
predict(., df) %>%
as.data.frame()
names(binary.lc) <- gsub("NLCD_Land", "", names(binary.lc)) %>%
gsub("\\/| |,", "_", .) %>%
gsub("__", "_", .) %>%
tolower()
# Remove duplicates from feature engineering
binary.lc <- binary.lc %>%
select(-c("unclassified", "perennial_snow_ice", "barren_land",
"shrub_scrub", "herbaceous"))
# Combine with dataframe, remove land cover categorical var
df <- df %>% select(-NLCD_Land) %>% cbind(binary.lc)
saveRDS(df, final.fpath)
} else {
df <- readRDS(final.fpath)
}
# Convert to data.table
df %>% setDT()
# View output
df %>% as_tibble()
In this section, the correlation between the predictors and the dependent variable (observations) is examined for each species. The correlation measure assists in identifying which predictors have a linear relationship with the response variable and can, therefore, impact model predictions.
obs.cor <- purrr::map(sort(unique(df$common.name)), function(spec) {
corr.matrix <- cor(dplyr::select(df[common.name == spec],
-c("state", "common.name", "geometry")))
obs.cor <- corr.matrix[which(rownames(corr.matrix) == "observations"),] %>%
as.data.frame()
obs.cor$variable <- rownames(obs.cor)
obs.cor %>%
filter(variable != "observations") %>%
rename(!!spec := `.`) %>%
arrange(-abs(!!sym(spec))) %>%
mutate(!!spec := paste0(variable, ": ", signif(!!sym(spec), 2))) %>%
dplyr::select(!!sym(spec))
}) %>%
do.call("cbind", .)
obs.cor %>% as_tibble()
Ensuring the quality and robustness of a model requires that it is evaluated on unseen data. Similarly, it is important to avoid "data leakage", which occurs when information from the test set inadvertently influences the training process. Causes of data leakage range from pre-processing steps being applied to the entire dataset rather than being fit on the training set alone, to subtler issues such as target leakage, where information unavailable at prediction time enters the training data through preparation mistakes.
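As a minimal sketch of leakage-free pre-processing with caret (using hypothetical data `d`; the same pattern appears in the modeling section below), the scaling parameters are estimated from the training rows only and then applied to both sets:

```r
library(caret)

# Hypothetical numeric data, split into train/test rows
d <- data.frame(x1 = rnorm(100), x2 = runif(100))
idx <- createDataPartition(d$x1, p = 0.7, list = FALSE)

# Learn centering/scaling parameters from the training rows only...
preproc <- preProcess(d[idx, ], method = c("center", "scale"))

# ...then apply that same transformation to both sets
train.scaled <- predict(preproc, d[idx, ])
test.scaled <- predict(preproc, d[-idx, ])
```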
In this section, the data is split into a training set and a test set. The training set is used for training the model, while the test set is reserved to evaluate the model’s performance. The data is stratified based on latitude, longitude, species, state, and absence to ensure that the distribution of these variables is consistent between the train and test sets. This stratification helps maintain representative samples and mitigates potential biases.
# Set seed for splitting and modeling
set.seed(19)
stratified.split.idx <- function(df, p=0.7, lat.lon.bins=25) {
# Cut along lat/lon values to create grids (lat.bin & lon.bin)
# lat.lon.bins is the number of divisions you want
df$lat.bin <- cut(df$lat, breaks=lat.lon.bins, labels = F)
df$lon.bin <- cut(df$lon, breaks=lat.lon.bins, labels = F)
df$absence <- df$observations == 0
# Create a new variable combining the stratification variables
df %>%
# stratify on lat/lon bins, species, state, and absence
mutate(strata = paste(lat.bin, lon.bin, common.name, state, absence)) %>%
pull(strata) %>%
# Create the data partitions
createDataPartition(., p = p, list = F) %>%
suppressWarnings()
}
prepare.data <- function(df, p=.7, lat.lon.bins=25) {
train.index <- stratified.split.idx(df, p=p, lat.lon.bins = lat.lon.bins)
df.train <- df[train.index, ]
df.test <- df[-train.index, ]
list(train = df.train,
test = df.test,
index = train.index)
}
# Get train/test indices
train.test <- prepare.data(df, .7)
# Split; Remove non-predictive variables
df.train <- df[train.test$index,]
df.train[, `:=` (geometry=NULL)]
df.test <- df[-train.test$index,]
df.test[, `:=` (geometry=NULL)]
train.x <- df.train %>% dplyr::select(-observations)
train.y <- df.train$observations
test.x <- df.test %>% dplyr::select(-observations)
test.y <- df.test$observations
For computational efficiency, models are cached. This section provides a function to retrieve the model from cache if it exists or save it to the cache if it doesn’t. This approach speeds up the modeling process, especially when iterating and fine-tuning various models, by avoiding retraining on the same dataset.
# Get model from cache if it has been saved before
get.model <- function(model, file.name, model.path) {
f.path <- file.path(model.path, file.name)
if (!dir.exists(model.path)) {
dir.create(model.path)
}
# Model cache check
if (file.exists(f.path)) {
model <- readRDS(f.path)
} else {
saveRDS(model, f.path)
}
model
}
LASSO (Least Absolute Shrinkage and Selection Operator) regression is a type of linear regression that uses shrinkage. Here, data values are shrunk towards a central point, like the mean. The lasso procedure encourages simple, sparse models (i.e., models with fewer parameters).
In this section, a LASSO regression model is fitted for each species in each state. The regularization strength is controlled by the lambda parameter, where a value of zero results in regular linear regression and increasing values result in more regularization. By fitting LASSO models and examining the coefficients, we can determine the importance of each variable and variable interaction. Variables/interactions with non-zero coefficients are considered important, while those with coefficients shrunk to zero are considered non-informative.
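For reference, with response $y$, predictors $x_i$, and penalty $\lambda$, the LASSO coefficients minimize the penalized least-squares objective (glmnet's elastic-net objective with $\alpha = 1$):

$$\hat{\beta} = \underset{\beta_0,\,\beta}{\arg\min}\; \frac{1}{2n} \sum_{i=1}^{n} \left( y_i - \beta_0 - x_i^{\top}\beta \right)^2 + \lambda \sum_{j=1}^{p} \lvert \beta_j \rvert$$

The $\ell_1$ penalty is what shrinks some coefficients exactly to zero, producing the sparse models described above.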
species <- sort(unique(df.train$common.name))
if (!dir.exists("artifacts/models")) dir.create("artifacts/models")
# Define the control for the train function
ctrl <- trainControl(method = "cv", number = 5)
lasso.list <- purrr::map(species, function(spec) {
spec.state.fit <- purrr::map(states, function(st) {
cat("Fitting LASSO model for", spec, "in", st, "\n")
spec.df <- df.train[common.name == spec & state == st][
, `:=` (state=NULL, common.name = NULL)]
# Remove any columns where all values are the same
.remove <- names(which(sapply(spec.df, function(x) length(unique(x)) <= 1)))
.remove <- .remove[.remove != "observations"]
if (!is_empty(.remove)) {
spec.df <- spec.df %>% dplyr::select(-.remove)
}
# Scale data
# Identify fields to center/scale
to.scale <- sapply(spec.df, function(x) is.numeric(x) &&
length(unique(x)) > 2)
to.scale[["observations"]] <- F
to.scale <- names(spec.df) %in% names(which(to.scale))
# Define the pre-processing steps (use the training data to avoid data leakage)
# Apply centering and scaling to the non-binary fields and non-target
preproc <- preProcess(spec.df[, ..to.scale],
method = c("center", "scale"))
# Center/Scale the training data
df.train.scaled <- bind_cols(spec.df[, !(..to.scale)],
predict(preproc, spec.df[, ..to.scale]))
# LASSO (L1); Elastic Net, where alpha = 1
fit.l1 <- get.model(
train(observations ~ (.)^2,
data = df.train.scaled,
method = "glmnet",
trControl = ctrl,
tuneGrid = expand.grid(alpha = 1,
lambda = seq(0, 1, by = 0.1)),
metric = "RMSE"),
file.name=paste0(tolower(gsub(" ", "_", spec)), "_", st, "_regr_l1.rds"),
model.path="artifacts/models/lasso_fs")
gc()
fit.l1
})
names(spec.state.fit) <- states
spec.state.fit
})
names(lasso.list) <- species
# Get variable importance for LASSO models, based on coefficients
spec.state <- expand.grid(common.name=species, state=states, stringsAsFactors=F)
lasso.imp <- purrr::map_df(1:nrow(spec.state), function(i) {
spec <- spec.state[i,]$common.name
st <- spec.state[i,]$state
fit <- lasso.list[[spec]][[st]]
coef.df <- coef(fit$finalModel, s = fit$bestTune$lambda) %>%
as.matrix() %>%
as.data.frame()
# Remove the intercept
coef.df <- coef.df[-1, , drop = F]
# Create a data frame of variable importance
var.importance <- tibble(
common.name = spec,
state = st,
variable = rownames(coef.df),
importance = abs(coef.df[,1])
) %>%
# Rank variables by importance
arrange(state, common.name, -importance, variable) %>%
# Only look at variables where imp. > 0
filter(importance > 0)
})
lasso.imp
Decision trees are a class of machine learning models that recursively split the dataset into subsets based on the value of input variables. The goal is to make the data within each subset as homogeneous as possible regarding the target variable. The importance of a variable in a decision tree model can be assessed by the frequency and depth at which it is used to split the data. Variables used frequently and closer to the root of the tree are typically considered more important.
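For a regression tree such as those fit below with rpart's "anova" method, each split is chosen to maximize the reduction in the within-node sum of squares,

$$\Delta = SS_{\text{parent}} - \left( SS_{\text{left}} + SS_{\text{right}} \right), \qquad SS = \sum_{i \in \text{node}} \left( y_i - \bar{y}_{\text{node}} \right)^2,$$

and rpart's reported variable importance sums these improvements over all splits (including surrogate splits) in which a variable appears.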
The primary advantage of using a tree-based model as opposed to a model like LASSO is that the relationships do not need to be linear, and interactions are naturally accounted for due to the hierarchical nature of the model itself.
In the code below, Decision Tree models are trained for each species in each state. The variable importance is extracted from the tree structure itself, which offers an intuitive understanding of the relationships and hierarchies within the data.
# Decision Tree Variable Importance
tree.list <- purrr::map(species, function(spec) {
spec.state.fit <- purrr::map(states, function(st) {
cat("Fitting Decision Tree model for", spec, "in", st, "\n")
spec.df <- df.train[common.name == spec & state == st][
, `:=` (state=NULL, common.name = NULL)]
# Remove any columns where all values are the same
.remove <- names(which(sapply(spec.df, function(x) length(unique(x)) <= 1)))
.remove <- .remove[.remove != "observations"]
if (!is_empty(.remove)) {
spec.df <- spec.df %>% dplyr::select(-.remove)
}
# Fit the Decision Tree model
fit.tree <- get.model(
rpart::rpart(observations ~ ., data=spec.df, method="anova",
control=rpart::rpart.control(cp=0.001)),
file.name=paste0(tolower(gsub(" ", "_", spec)), "_", st, "_tree.rds"),
model.path="artifacts/models/trees_fs")
gc()
fit.tree
})
names(spec.state.fit) <- states
spec.state.fit
})
names(tree.list) <- species
# Get variable importance for decision tree models
tree.imp <- purrr::map_df(1:nrow(spec.state), function(i) {
spec <- spec.state[i,]$common.name
st <- spec.state[i,]$state
fit <- tree.list[[spec]][[st]]
vi <- fit$variable.importance
# Create a data frame of variable importance
var.importance <- tibble(
common.name = spec,
state = st,
variable = names(vi),
importance = (vi - min(vi)) / (max(vi) - min(vi))
) %>%
# Rank variables by importance
arrange(state, common.name, -importance, variable)
})
tree.imp
Predictive modeling is iterative by nature, so conclusively selecting features ahead of the modeling process is impractical. The purpose of this exploratory analysis is just that: exploratory. The insights and results from this part of the study will be utilized in the iterative modeling process and adjusted as needed.